Exploring the Ensemble of Random Forests¶
In [1]:
import numpy as np
import pandas as pd
import os
import sys
sys.path.append('/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/STEP9_DATA_MODELLING_AND_EXPLORATION')
import matplotlib.pyplot as plt
from sklearn.metrics import root_mean_squared_error, mean_squared_error
import seaborn as sns
sns.set()
sns.set_style("whitegrid")
sns.set_context("paper")
In [2]:
# Input/output locations. NOTE(review): these are absolute local paths — consider
# deriving them from a single configurable base directory for portability.
# Fix: the previous external-drive assignment to results_dir was dead code
# (immediately overwritten by the line below), so it has been removed.
results_dir = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/RESULTS/Random_Forest_Results_On_Super_Group_Results_new'
directory = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/au_dyanamic_vegetation_project/DATASETS/MODELLED_TRAINING_DATA'
plots_dir = 'C:/Users/krish/Desktop/DYNAMIC MODEL VEGETATION PROJECT/Thesis/Plots_For_Thesis/Chapter 3'
In [3]:
# Candidate predictors, grouped by theme. The windowed features are generated
# from their prefixes so the naming scheme is stated once.
SEASONAL_FEATURES = ['photoperiod', 'photoperiod_gradient']
# Cumulative precipitation over trailing windows (days).
PRECIP_FEATURES = [f'precip_{w}' for w in (30, 90, 180, 365, 730, 1095, 1460)]
MEAN_ANNUAL_CLIMATE_FEATURES = ['MAT', 'MAP']
# Max/min temperature: lagged value plus trailing 7/14/30-day windows.
TEMP_FEATURES = [f'{p}_{w}' for p in ('tmax', 'tmin') for w in ('lag', '7', '14', '30')]
# Vapour-pressure deficit over the same windows.
VPD_FEATURES = [f'VPD_{w}' for w in ('lag', '7', '14', '30')]
FIRE_FEATURES = ['days_since_fire', 'fire_severity']
CO2_FEATURES = ['CO2']
SOIL_FEATURES = ['SLGA_1', 'SLGA_2', 'SLGA_3', 'DER_000_999']  # the soil attributes to include
TOPOGRAPHIC_FEATURES = ['aspect_1s', 'twi_1s']
# Full model feature set (note: excludes MEAN_ANNUAL_CLIMATE_FEATURES).
FEATURES = SEASONAL_FEATURES + PRECIP_FEATURES + VPD_FEATURES + FIRE_FEATURES + CO2_FEATURES + TEMP_FEATURES + SOIL_FEATURES + TOPOGRAPHIC_FEATURES
# Time-varying vs site-static features, used to split the importance plots.
TEMPORAL_FEATURES = SEASONAL_FEATURES + PRECIP_FEATURES + VPD_FEATURES + FIRE_FEATURES + CO2_FEATURES + TEMP_FEATURES
SPATIAL_FEATURES = SOIL_FEATURES + TOPOGRAPHIC_FEATURES + MEAN_ANNUAL_CLIMATE_FEATURES
In [4]:
def plotPredictions(df, TARGET, n_folds = 10, directory_plot_output = '', msg = '', split = '', fire_split = ''):
    """Plot every fold's prediction against the observations, one panel per target.

    Parameters
    ----------
    df : time-indexed DataFrame with observation columns named as in TARGET and
        per-fold prediction columns '<target>_prediction_<k>' for k = 1..n_folds.
    TARGET : sequence of observed target column names (e.g. 'pv_filter').
    n_folds : number of per-fold prediction columns to draw per target.
    directory_plot_output : if non-empty, save the figure to this path.
    msg : figure suptitle.
    split, fire_split : optional iterables of x positions marked with dashed
        vertical lines (black for splits, red for fires).
    """
    # Fix: was hardcoded nrows=3; now follows the number of targets requested.
    fig, ax = plt.subplots(nrows = len(TARGET), figsize = (15, 10))
    fig.suptitle(msg, fontsize=30)
    for i, v in enumerate(TARGET):
        frac = v.split("_")[0]
        for folder_num in range(n_folds):
            df[f'{v}_prediction_{folder_num + 1}'].plot(ax=ax[i], ylim = (0,100), label = f'Modelled {frac}')
        df[v].plot(ax=ax[i], color = 'black', label = f'Observed {frac}')
        ax[i].legend()
        ax[i].grid(True)
        if split:
            for s in split:
                ax[i].axvline(s, color='black', ls='--')
        if fire_split:
            for f in fire_split:
                ax[i].axvline(f, color='red', ls='--')
    # Fix: directory_plot_output was accepted but ignored, so the figure was
    # never saved (unlike plotMedianPredictions). Save when a path is given.
    if directory_plot_output:
        plt.savefig(fname = directory_plot_output)
def plotMedianPredictions(df, TARGET, n_folds = 10, directory_plot_output = '', msg = '', split = '', fire_split = ''):
    """Plot the ensemble median with its 2.5%-97.5% band against the observations.

    Expects df to hold, per fraction, '<frac>_median', '<frac>_lower' and
    '<frac>_upper' columns alongside the observed columns named in TARGET.
    Saves the figure when directory_plot_output is non-empty.
    """
    fig, panels = plt.subplots(nrows = 3, figsize = (15, 10), sharex = True)
    fig.suptitle(msg, fontsize = 30)
    for row, target in enumerate(TARGET):
        frac = target.split("_")[0]
        panel = panels[row]
        # Median line first, then the quantile envelope, then the observations,
        # so the legend lists them in that order.
        df[f'{frac}_median'].plot(ax = panel, ylim = (0, 100), label = f'Modelled {frac} median')
        panel.fill_between(df.index,
                           df[f'{frac}_lower'].values,
                           df[f'{frac}_upper'].values,
                           color = 'blue', alpha = 0.3,
                           label = '2.5%-97.5% Quantiles')
        df[target].plot(ax = panel, color = 'black', label = f'Observed {frac}')
        panel.legend()
        panel.grid(True)
        if split:
            for mark in split:
                panel.axvline(mark, color = 'black', ls = '--')
        if fire_split:
            for mark in fire_split:
                panel.axvline(mark, color = 'red', ls = '--')
    if directory_plot_output:
        plt.savefig(fname = directory_plot_output)
In [5]:
# The eleven modelled 'super groups' (biome / growth-form combinations).
# Names containing '/' are mapped to '_' when used as results folder names.
super_group_list = [
    'Desert Chenopod',
    'Desert Forb',
    'Desert Hummock.grass',
    'Desert Shrub',
    'Desert Tree.Palm',
    'Desert Tussock.grass',
    'Temp/Med Shrub',
    'Temp/Med Tree.Palm',
    'Temp/Med Tussock.grass',
    'Tropical/Savanna Tree.Palm',
    'Tropical/Savanna Tussock.grass',
]
Get the Variable Importances¶
In [6]:
# Select the super group to analyse (index 5 == 'Desert Tussock.grass') and
# derive its results folder name: '/' is not a valid path character.
chosen_super_group = super_group_list[5]
super_group_folder_name = chosen_super_group.replace('/', '_')
print(f'Looking at {super_group_folder_name}')
Looking at Desert Tussock.grass
In [7]:
# Folder of per-fold variable-importance CSVs for the chosen super group.
var_path = f'{results_dir}/{super_group_folder_name}/Results/Variable_Importances'
# CSV of held-out test-set predictions (all folds) for the chosen super group.
test_pred_path = f'{results_dir}/{super_group_folder_name}/Results/Test_Predictions/test_predictions.csv'
In [8]:
# One horizontal-bar importance panel per fold model, on a 4x3 grid.
n_folds = 10  # was a magic '10'; matches number_of_folds used in later cells
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(15, 12))
for i, ax in enumerate(axes.flat):
    if i < n_folds:
        folder_num = i + 1
        # Each fold's CSV is indexed by feature name; 'importances_mean' holds
        # its mean importance score.
        per_importance = pd.read_csv(f'{var_path}/KFold_{folder_num}_RF_VariableImportance.csv', index_col = 0)
        per_importance['importances_mean'].plot.barh(ax = ax)
        ax.set_xlabel('Mean Gain in MSE')
        ax.title.set_text(f'Model {folder_num}')
    else:
        # The 12-panel grid only holds 10 models; blank the two spare axes.
        ax.axis('off')
plt.tight_layout()
fig.savefig(f'{plots_dir}/{super_group_folder_name}_Individual_Model_Importances.png')
Aggregating the Variable Importances by their Average Score¶
In [9]:
number_of_folds = 10
# Pool each variable's importance score across the fold models. A feature that
# a fold's model never used gets a 0 for that fold, so every variable ends up
# with one score per fold and the later mean is over all folds.
variable_common_rank = dict()
for f in range(number_of_folds):
    folder_num = f + 1
    per_importance = pd.read_csv(f'{var_path}/KFold_{folder_num}_RF_VariableImportance.csv')
    per_importance = per_importance.sort_values('importances_mean', ascending = False).reset_index(drop = True)
    feature_track = []
    for _, row in per_importance.iterrows():
        var = row['Unnamed: 0']          # first (unnamed) CSV column holds the feature name
        score = row['importances_mean']  # raw score (was misleadingly named 'rank')
        # setdefault replaces the 'if var in dict.keys()' branching.
        variable_common_rank.setdefault(var, []).append(score)
        feature_track.append(var)
    # Score 0 for features absent from this fold's importance file.
    features_not_found = []
    for var in FEATURES:
        if var not in feature_track:
            variable_common_rank.setdefault(var, []).append(0)
            features_not_found.append(var)
    print(f'{features_not_found} was not found in Model {f+1}')
['precip_30', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_14', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_14', 'tmin_30', 'SLGA_3', 'DER_000_999', 'twi_1s'] was not found in Model 1 ['photoperiod', 'precip_30', 'precip_90', 'precip_180', 'precip_365', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_7', 'VPD_14', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_30', 'SLGA_3', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 2 ['precip_90', 'precip_365', 'precip_730', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_14', 'tmin_30', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 3 ['photoperiod', 'precip_30', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_14', 'VPD_30', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmin_lag', 'tmin_7', 'tmin_14', 'twi_1s'] was not found in Model 4 ['photoperiod_gradient', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_7', 'VPD_14', 'days_since_fire', 'fire_severity', 'CO2', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_7', 'tmin_14', 'tmin_30', 'SLGA_3', 'aspect_1s', 'twi_1s'] was not found in Model 5 ['photoperiod', 'VPD_lag', 'VPD_7', 'VPD_14', 'days_since_fire', 'fire_severity', 'CO2', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_30', 'aspect_1s', 'twi_1s'] was not found in Model 6 ['photoperiod_gradient', 'precip_180', 'precip_365', 'precip_1460', 'VPD_14', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmin_lag', 'tmin_7', 'tmin_14', 'SLGA_2', 'SLGA_3', 'aspect_1s', 'twi_1s'] was not found in Model 7 ['precip_365', 'precip_730', 'precip_1095', 'VPD_lag', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 
'tmin_lag', 'tmin_7', 'SLGA_3', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 8 ['photoperiod', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_30', 'tmin_lag', 'tmin_7', 'SLGA_2', 'DER_000_999'] was not found in Model 9 ['precip_30', 'precip_90', 'precip_180', 'precip_730', 'precip_1095', 'precip_1460', 'VPD_lag', 'VPD_7', 'VPD_14', 'VPD_30', 'days_since_fire', 'fire_severity', 'tmax_lag', 'tmax_7', 'tmax_14', 'tmax_30', 'tmin_lag', 'tmin_7', 'tmin_14', 'SLGA_3', 'DER_000_999', 'aspect_1s', 'twi_1s'] was not found in Model 10
In [10]:
# Collapse each variable's per-fold scores to their mean, then sort ascending
# (column 0 holds the mean score) so barh draws the most important bar on top.
collapsed_importances = {var: np.mean(scores) for var, scores in variable_common_rank.items()}
collapsed_importance = (
    pd.DataFrame.from_dict(collapsed_importances, orient = 'index')
    .sort_values(0)
)
In [11]:
# Side-by-side bar charts of mean importance: site-static (spatial) features
# on the left, time-varying (temporal) features on the right.
# (Removed a dead 'number_of_folds = 10' that this cell never used.)
fig, axes = plt.subplots(ncols= 2, figsize = (10,5))
# Fix: boolean masks belong with label-based .loc, not positional .iloc; and
# both panels now plot the score column as a Series so their styling matches
# (the temporal panel previously plotted the whole DataFrame).
spatial = collapsed_importance.loc[collapsed_importance.index.isin(SPATIAL_FEATURES)]
spatial[0].plot.barh(ax = axes[0], legend=False)
temporal = collapsed_importance.loc[collapsed_importance.index.isin(TEMPORAL_FEATURES)]
temporal[0].plot.barh(ax = axes[1], legend=False)
for ax in axes.flat:
    ax.grid(True)
    ax.set_xlabel('Mean Gain in MSE')
fig.suptitle(f'{super_group_folder_name}', fontsize = 15)
plt.tight_layout()
fig.savefig(f'{plots_dir}/{super_group_folder_name}_Aggregated_Model_Importances.png')
Assessing the Predictions¶
In [12]:
number_of_folds = 10
test_set = pd.read_csv(test_pred_path, parse_dates = ['time']).set_index('time').sort_values('time')
# Summarise the fold predictions per time step for each cover fraction:
# median, mean, and the 2.5%/97.5% quantile envelope. This loop replaces
# three copy-pasted blocks that reused 'pv'-named temporaries for npv and bs.
for frac in ('pv', 'npv', 'bs'):
    predictor_cols = [f'{frac}_filter_prediction_{j + 1}' for j in range(number_of_folds)]
    # describe() summarises column-wise, so transpose to aggregate across the
    # folds for every row, then transpose back to align with test_set's index.
    summaries = test_set[predictor_cols].T.describe(percentiles = [0.025, 0.5, 0.975]).T
    test_set[f'{frac}_median'] = summaries['50%'].values
    test_set[f'{frac}_mean'] = summaries['mean'].values
    test_set[f'{frac}_lower'] = summaries['2.5%'].values
    test_set[f'{frac}_upper'] = summaries['97.5%'].values
In [13]:
# Fix: the label said 'MSEs' but root_mean_squared_error reports RMSE.
print('Mean RMSEs')
# RMSE is symmetric, but pass observations first per the (y_true, y_pred) convention.
pv_mean_rmse = round(root_mean_squared_error(test_set['pv_filter'], test_set['pv_mean']),3)
npv_mean_rmse = round(root_mean_squared_error(test_set['npv_filter'], test_set['npv_mean']),3)
bs_mean_rmse = round(root_mean_squared_error(test_set['bs_filter'], test_set['bs_mean']),3)
print(f'PV: {pv_mean_rmse}')
print(f'NPV: {npv_mean_rmse}')
print(f'BS: {bs_mean_rmse}')
Mean MSEs PV: 10.251 NPV: 11.095 BS: 17.298
In [14]:
TARGET = ['pv_filter', 'npv_filter', 'bs_filter'] # observed cover-fraction columns the prediction plots compare against
In [15]:
# One median-prediction figure per site, titled with the per-site RMSEs.
# (np.unique also sorts, so sites are processed in alphabetical order.)
sites_unique = np.unique(test_set['site_location_name'])
for sit in sites_unique:
    site_specific = test_set[test_set['site_location_name'] == sit]
    site_pv_rmse = round(root_mean_squared_error(site_specific['pv_median'], site_specific['pv_filter']),2)
    site_npv_rmse = round(root_mean_squared_error(site_specific['npv_median'], site_specific['npv_filter']),2)
    site_bs_rmse = round(root_mean_squared_error(site_specific['bs_median'], site_specific['bs_filter']),2)
    # Reuse the already-filtered frame instead of re-filtering test_set
    # (a commented-out plotPredictions call was removed as dead code).
    plotMedianPredictions(site_specific, n_folds = number_of_folds,
                          TARGET = TARGET, msg = f'{sit}, [pv_rmse: {site_pv_rmse}, npv_rmse: {site_npv_rmse}, bs_rmse: {site_bs_rmse}]',
                          directory_plot_output = f'{plots_dir}/{super_group_folder_name}_{sit}_Median_Predictions.png')
Inspect Linear Alignment¶
In [16]:
# Observed (x) vs modelled-median (y) scatter with regression and 1:1 lines,
# one panel per cover fraction.
fig, axes = plt.subplots(nrows = 2, ncols= 2, figsize = (10,10))
fractions = ['pv', 'npv', 'bs']
one_to_one = list(range(101))  # hoisted out of the loop; also stops the comprehension reusing the name 'i'
for i, ax in enumerate(axes.flat):
    if i < len(fractions):
        sns.regplot(data = test_set, x = f'{fractions[i]}_filter', y = f'{fractions[i]}_median', ax = ax, line_kws=dict(color="r"))
        ax.plot(one_to_one, one_to_one, color = 'black')
        ax.grid(True)
        # Fix: set_xlabel was called twice, so the second call overwrote the
        # first and the y axis was never labelled.
        ax.set_xlabel(f'Observed {fractions[i].upper()}')
        ax.set_ylabel(f'Modelled {fractions[i].upper()}')
        ax.set_xlim([0, 100])
        ax.set_ylim([0, 100])
        ax.set_aspect('equal')
    else:
        ax.axis('off')